Quick example of Latent Profile Analysis in R

library(tidyverse)
survey <- read_csv("https://raw.githubusercontent.com/whipson/tidytuesday/master/young_people.csv") %>%
  select(History:Pets)
dim(survey)
## [1] 1010   32
head(survey)
library(careless)
library(psych)

interests <- survey %>%
  mutate(string = longstring(.)) %>%
  mutate(md = outlier(., plot = FALSE))
head(interests)
cutoff <- (qchisq(p = 1 - .001, df = ncol(interests)))
cutoff
## [1] 65.24722
interests_clean <- interests %>%
  filter(string <= 10,
         md < cutoff) %>%
  select(-string, -md)
dim(interests_clean)
## [1] 997  32
head(interests_clean)
library(mclust)

interests_clustering <- interests_clean %>%
  na.omit() %>%
  mutate_all(list(scale))  # scale to see the differences more clearly
head(interests_clustering)
BIC <- mclustBIC(interests_clustering)
plot(BIC)

mclustModelNames
## function (model) 
## {
##     type <- switch(EXPR = as.character(model), E = "univariate, equal variance", 
##         V = "univariate, unequal variance", EII = "spherical, equal volume", 
##         VII = "spherical, varying volume", EEI = "diagonal, equal volume and shape", 
##         VEI = "diagonal, equal shape", EVI = "diagonal, equal volume, varying shape", 
##         VVI = "diagonal, varying volume and shape", EEE = "ellipsoidal, equal volume, shape and orientation", 
##         EVE = "ellipsoidal, equal volume and orientation", VEE = "ellipsoidal, equal shape and orientation", 
##         VVE = "ellipsoidal, equal orientation", EEV = "ellipsoidal, equal volume and shape", 
##         VEV = "ellipsoidal, equal shape", EVV = "ellipsoidal, equal volume", 
##         VVV = "ellipsoidal, varying volume, shape, and orientation", 
##         X = "univariate normal", XII = "spherical multivariate normal", 
##         XXI = "diagonal multivariate normal", XXX = "ellipsoidal multivariate normal", 
##         warning("invalid model"))
##     return(list(model = model, type = type))
## }
## <bytecode: 0x0000000018554338>
## <environment: namespace:mclust>
summary(BIC)
## Best BIC values:
##             VVE,3       VEE,3      EVE,3
## BIC      -75042.7 -75165.1484 -75179.165
## BIC diff      0.0   -122.4442   -136.461
mod1 <- Mclust(interests_clustering, modelNames = "VEE", G = 3, x = BIC)
summary(mod1)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 3
## components: 
## 
##  log-likelihood   n  df       BIC       ICL
##       -35455.83 874 628 -75165.15 -75216.14
## 
## Clustering table:
##   1   2   3 
## 137 527 210
ICL <- mclustICL(interests_clustering)
plot(ICL)

summary(ICL)
## Best ICL values:
##              VVE,3        VEE,3      EVE,3
## ICL      -75134.69 -75216.13551 -75272.891
## ICL diff      0.00    -81.44795   -138.203
mclustBootstrapLRT(interests_clustering, modelName = "VEE")
## ------------------------------------------------------------- 
## Bootstrap sequential LRT for the number of mixture components 
## ------------------------------------------------------------- 
## Model        = VEE 
## Replications = 999 
##               LRTS bootstrap p-value
## 1 vs 2    197.0384             0.001
## 2 vs 3    684.8743             0.001
## 3 vs 4   -124.1935             1.000

Visualizing LPA

library(reshape2)
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
  rownames_to_column() %>%
  rename(Interest = rowname) %>%
  melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
  mutate(Mean = round(Mean, 2),
         Mean = ifelse(Mean > 1, 1, Mean))
round(table(mod1$classification)/nrow(interests_clustering),2)
## 
##    1    2    3 
## 0.16 0.60 0.24
means
means %>%
  ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
  geom_point(size = 2.25) +
  geom_line(size = 1.25) +
  scale_x_discrete(limits = c("Active sport", "Adrenaline sports", "Passive sport",
                              "Countryside, outdoors", "Gardening", "Cars",
                              "Art exhibitions", "Dancing", "Musical instruments",
                              "Theatre", "Writing", "Reading","Geography", 
                              "History", "Law", "Politics", "Psychology", "Religion", 
                              "Foreign languages", "Biology", "Chemistry", "Mathematics", 
                              "Medicine", "Physics", "Science and technology",
                              "Internet", "PC","Celebrities", "Economy Management",
                              "Fun with friends", "Shopping", "Pets")) +
  labs(x = NULL, y = "Standardized mean interest") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")

library(knitr)
library(kableExtra)
data <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
  rownames_to_column() %>%
  rename(Interest = rowname) %>%
  mutate(X1 = round(X1, 2),
         X1 = ifelse(X1 > 1, 1, X1),
         X2 = round(X2, 2),
         X2 = ifelse(X2 > 1, 1, X2),
         X3 = round(X3, 2),
         X3 = ifelse(X3 > 1, 1, X3))
         
data %>%
  mutate(
    X1 = cell_spec(X1, "html", color = ifelse(X1 > 0.3, "red", "black")),
    X2 = cell_spec(X2, "html", color = ifelse(X2 > 0.3, "red", "black")),
    X3 = cell_spec(X3, "html", color = ifelse(X3 > 0.3, "red", "black"))) %>%
  kable(format = "html", escape = F) %>%
  kable_styling("striped", full_width = F)
Interest X1 X2 X3
History -0.11 -0.08 0.28
Psychology -0.13 -0.13 0.43
Politics -0.21 -0.01 0.15
Mathematics 0.03 0 -0.03
Physics 0.25 -0.1 0.08
Internet -0.14 0.08 -0.11
PC -0.18 0.08 -0.09
Economy Management -0.48 0.15 -0.06
Biology 1 -0.35 0.01
Chemistry 1 -0.42 -0.08
Reading 0.18 -0.23 0.47
Geography -0.1 -0.06 0.23
Foreign languages -0.1 -0.06 0.23
Medicine 1 -0.37 0.06
Law -0.13 -0.02 0.14
Cars -0.09 0.14 -0.3
Art exhibitions -0.12 -0.18 0.53
Religion 0.05 -0.11 0.25
Countryside, outdoors 0.06 -0.03 0.03
Dancing 0.16 -0.12 0.21
Musical instruments 0.01 -0.18 0.47
Writing -0.41 -0.52 1
Passive sport -0.02 0.07 -0.18
Active sport -0.01 0 0
Gardening 0.28 -0.19 0.29
Celebrities -0.03 0.02 -0.03
Shopping 0.02 -0.01 0.01
Science and technology 0.26 -0.07 0
Theatre 0.02 -0.15 0.36
Fun with friends 0.12 0.02 -0.13
Adrenaline sports -0.06 0 0.04
Pets 0.28 -0.08 0.01
p <- means %>%
  mutate(Profile = recode(Profile, 
                          X1 = "Science: 16%",
                          X2 = "Disinterest: 60%",
                          X3 = "Arts & Humanities: 24%")) %>%
  ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
  geom_point(size = 2.25) +
  geom_line(size = 1.25) +
  scale_x_discrete(limits = c("Active sport", "Adrenaline sports", "Passive sport",
                              "Countryside, outdoors", "Gardening", "Cars",
                              "Art exhibitions", "Dancing", "Musical instruments", 
                              "Theatre", "Writing", "Reading", "Geography",
                              "History", "Law", "Politics", "Psychology", "Religion",
                              "Foreign languages", "Biology", "Chemistry", "Mathematics", 
                              "Medicine", "Physics", "Science and technology",
                              "Internet", "PC", "Celebrities", "Economy Management",
                              "Fun with friends", "Shopping", "Pets")) +
  labs(x = NULL, y = "Standardized mean interest") +
  theme_bw(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")

p

library(plotly)
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
  layout(legend = list(orientation = "h", y = 1.2))